\documentclass[t,12pt,aspectratio=169]{beamer} % 16:9 宽屏比例，适合现代投影
\usepackage{ctex} % 中文支持
\usepackage{amsmath, amssymb} % 数学公式与符号
\usepackage{graphicx}
\usepackage{url}
\usepackage{verbatim}

% 主题设置（推荐简洁风格）
\usetheme{Madrid}
\usecolortheme{default} % 可选：seahorse, beaver, dolphin 等

\title{R语言统计入门第11章：多元回归}
\author{PD ET AL}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{目录}

\begin{enumerate}
\item[11.0.]  多元线性回归模型的基本理论
\item[11.1.]  多维数据绘图
\item[11.2.]  模型设定与模型输出
\item[11.3.]  模型筛选
\end{enumerate}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{11.0.1 多元线性回归模型} 

\begin{itemize}

\item 设有自变量 $x_1,x_2,\cdots,x_p$ 和因变量 $y$. 
%\[ (x_{i1},x_{i2},\cdots, x_{ip},y_i), \,\, i=1,2,\cdots,n. \]

\item  设有 $n$ 组观测数据，写成表格的形式：
\begin{center}
\begin{tabular}{|c|cccc|c|} \hline 
变量  & $x_{1}$ & $x_{2}$ & $\cdots$ & $x_{p}$ & $y$ \\ \hline 
数据$1$ & $x_{11}$ & $x_{12}$ & $\cdots$ & $x_{1p}$ & $y_1$ \\ \hline 
数据$2$ & $x_{21}$ & $x_{22}$ & $\cdots$ & $x_{2p}$ & $y_2$ \\ \hline 
$\vdots$ & $\vdots$  & $\vdots$ & $\vdots$  & $\vdots$ & $\vdots$ \\ \hline
数据$n$ & $x_{n1}$ & $x_{n2}$ & $\cdots$ & $x_{np}$ & $y_n$ \\ \hline 
\end{tabular}
\end{center}

\item 多元线性回归模型如下，其中 $\beta_0,\beta_1,\cdots,\beta_p$ 为待估计的参数：
\[ {\color{red} y_i = \beta_0+\beta_1x_{i1}+\beta_2x_{i2}+\cdots+\beta_{p}x_{ip}+\varepsilon_i,\,\,\, i=1,2,\cdots,n} \]

\item 任务之一是用自变量 $x_1,x_2,\cdots,x_p$ 的{\color{red}线性函数}来解释和预测因变量 $y$.

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{11.0.2 多元线性回归模型的基本假设}  

\begin{enumerate}

\item 自变量 $x_1,x_2,\cdots,x_p$ 是确定的观测值，相互不影响。%具体地说，是指上述表格的 $p$ 个列向量线性无关，最好是互相垂直。

\item 因变量与自变量之间的关系是近似线性的，即 
\[ y_i = \beta_0+\beta_1x_{i1}+\beta_2x_{i2}+\cdots+\beta_{p}x_{ip}+\varepsilon_i,\,\,\, i=1,2,\cdots,n.\]

\item (Gauss-Markov条件) 误差项 $\varepsilon_1,\varepsilon_2,\cdots,\varepsilon_n$ 是均值为零、方差相同、且两两不相关的随机变量，即
{\color{red}
\begin{eqnarray*}
\left\{\begin{array}{ll}
\mathbb{E}(\varepsilon_i) = 0, \,\,\, \text{var}(\varepsilon_i) = \sigma^2, & i=1,2,\cdots,n,\\
\text{cov}(\varepsilon_i,\varepsilon_j) = 0, &  i\neq j, \,\, i,j=1,2,\cdots,n.
\end{array}\right.
\end{eqnarray*}}

\vspace{-0.2cm}

\item 我们还经常假设误差项服从独立同分布的正态分布，即 
\begin{eqnarray*}
\left\{\begin{array}{ll}
\varepsilon_i \sim N(0,\sigma^2), & i=1,2,\cdots,n,\\
\text{cov}(\varepsilon_i,\varepsilon_j) = 0, &  i\neq j, \,\, i,j=1,2,\cdots,n.
\end{array}\right.
\end{eqnarray*}

\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.1.1. 多变量数据的画图 Plotting multivariate data}

\begin{itemize}
\item[1.] 问题：载入数据框 \,{\color{blue}\verb+cystfibr+}, 解释各变量的含义。

\item 解答：这是一项关于囊性纤维化病人的肺功能的数据。25位病人的年龄为7--23岁，每个病人的10个测量数据的含义参见书本第251页。

\vspace{0.3cm}

\item[2.] 问题：画出任意两个变量之间的散点图。
\item 解答：使用 \,{\color{blue}\verb+pairs()+} 函数，第一个参数为数据框。

{\small\color{blue}
\begin{verbatim}
> library(ISwR)
> cystfibr
> pairs(cystfibr)
> pairs(cystfibr,gap=0)
> pairs(cystfibr,gap=0,cex.labels=0.9)
\end{verbatim}
}

%%%.  \,{\color{blue}\verb++}


\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.1.2. 囊性纤维化病人的肺功能数据的两两散点图  }

\begin{figure}
\centering
\includegraphics[height=0.65\textheight, width=0.8\textwidth, keepaspectratio]{cystfibr-pairwise-plot.png}
\caption{Pairwise plots for cystic fibrosis data. }
\end{figure}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.2.1. 模型设定（数据与问题） Model specification }

\begin{itemize}
\item 问题：考虑 \,{\color{blue}\verb+cystfibr+} 数据。
\begin{enumerate}
\item  建立一个线性回归模型，用其余变量来解释变量 \,{\color{blue}\verb+pemax+}. 
\item  对这个回归模型进行方差分析。
\item  用逐步回归方法确定最优模型。
\end{enumerate}

\item 解答：使用 \,{\color{blue}\verb+lm()+} 函数，第一个参数为模型公式。

{\small\color{blue}
\begin{verbatim}
> attach(cystfibr)
> lm01<-lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
> summary(lm01)
> anova(lm01)
> step(lm01,direction='both')
\end{verbatim}
}


\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.2.2. 线性回归模型的输出结果 }

{\scriptsize\color{blue}
\begin{verbatim}
    Call: 
    lm(formula = pemax ~ age + sex + height + weight + bmp + fev1 + rv + frc + tlc)
    
    Residuals:
        Min      1Q  Median      3Q     Max 
    -37.338 -11.532   1.081  13.386  33.405 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)
    (Intercept) 176.0582   225.8912   0.779    0.448
    age          -2.5420     4.8017  -0.529    0.604
    sex          -3.7368    15.4598  -0.242    0.812
    height       -0.4463     0.9034  -0.494    0.628
    weight        2.9928     2.0080   1.490    0.157
    ...
    
    Residual standard error: 25.47 on 15 degrees of freedom
    Multiple R-squared:  0.6373,	Adjusted R-squared:  0.4197 
    F-statistic: 2.929 on 9 and 15 DF,  p-value: 0.03195
\end{verbatim}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.2.3. 输出结果的理解 }

\begin{itemize}
\item[1.]  解释这个线性回归模型的输出结果。
\item 解答：第一部分是模型公式，第二部分是残差的四分位数，第三部分是每个自变量的回归系数和显著性检验，第四部分是模型的显著性检验，包括R方和p值。

\vspace{0.3cm}

\item[2.] 问题：为什么每个变量都不显著，但是模型是显著的？
\item 解答：$t$ 检验只说明在保留其余变量时可以单独删除某一个变量；由于自变量之间高度相关（多重共线性），它们联合起来仍然显著，所以模型检验的 $p$ 值小于 0.05。

\vspace{0.3cm}

\item[3.] 问题：性别变量 \,{\color{blue}\verb+sex+} 是二元变量，将其转成因子型，得到同样的回归系数。如何理解这个系数？  
\item 解答：%这是女性相对于男性的增加值。

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.2.4. 方差分析的输出结果 }

{\footnotesize\color{blue}
\begin{verbatim}
    Analysis of Variance Table
    
    Response: pemax
              Df  Sum Sq Mean Sq F value   Pr(>F)   
    age        1 10098.5 10098.5 15.5661 0.001296 **
    sex        1   955.4   955.4  1.4727 0.243680   
    height     1   155.0   155.0  0.2389 0.632089   
    weight     1   632.3   632.3  0.9747 0.339170   
    bmp        1  2862.2  2862.2  4.4119 0.053010 . 
    fev1       1  1549.1  1549.1  2.3878 0.143120   
    rv         1   561.9   561.9  0.8662 0.366757   
    frc        1   194.6   194.6  0.2999 0.592007   
    tlc        1    92.4    92.4  0.1424 0.711160   
    Residuals 15  9731.2   648.7                    
\end{verbatim}
}


\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.2.5. 解释方差分析的输出结果 }

\begin{itemize}
\item[1.]  解释这个方差分析的输出结果。

\item 解答：年龄  \,{\color{blue}\verb+age+} 解释了很大一部分的方差。

\vspace{0.3cm}

\item[2.]  问题：为什么变量 \,{\color{blue}\verb+age+} 在 \,{\color{blue}\verb+t+} 检验中不显著，但在方差分析中显著？

\item 解答：

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.3.1. 模型搜索 Model search }

\begin{itemize}
\item[1.] 问题：人工向后消元法的思路是什么？
\item 解答：从全模型出发，每次删除最不显著（$p$ 值最大）的一个变量并重新拟合，看剩余变量是否变得显著。

\vspace{0.3cm}

\item[2.] 问题：逐步回归的思路是什么？
\item 解答：从全模型出发，按照一定准则，依次删除或增加一个变量，直到准则不再更优。


\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.3.2. 逐步回归的输出结果 }

{\scriptsize\color{blue}
\begin{verbatim}
    Step:  AIC=160.66
    pemax ~ weight + bmp + fev1 + rv
    
             Df Sum of Sq   RSS    AIC
    <none>                10355 160.66
    - rv      1    1183.6 11538 161.36
    + tlc     1     197.1 10158 162.18
    + height  1     191.0 10164 162.19
    + age     1     178.1 10176 162.22
    + frc     1       3.4 10351 162.65
    + sex     1       2.4 10352 162.65
    - bmp     1    3072.6 13427 165.15
    - fev1    1    3717.1 14072 166.33
    - weight  1   10930.2 21285 176.67
    
    Call: lm(formula = pemax ~ weight + bmp + fev1 + rv)
    Coefficients:
    (Intercept)       weight          bmp         fev1           rv  
        63.9467       1.7489      -1.3772       1.5477       0.1257 
\end{verbatim}
}    
    

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{11.3.3. 解释逐步回归的输出结果 }

\begin{itemize}

\item[1.]  问题：写出使用逐步回归方法找出的模型。
\item 解答：\,{\color{blue}\verb|pemax ~ weight + bmp + fev1 + rv|}

\vspace{0.3cm}

\item[2.] 问题：用人工向后消元法，得到的模型是什么？如何解释这个结论？
\item 解答：\,{\color{blue}\verb| pemax ~ age |} 或 \,{\color{blue}\verb| pemax ~ height |} 

对7--23岁的囊性纤维化病人，最大呼气压（pemax）跟年龄和身高有很大关系。

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\end{document}
